{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fine-tuning a model with Keras"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install the Transformers and Datasets libraries to run this notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install datasets transformers[sentencepiece]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "from transformers import AutoTokenizer\n",
    "import numpy as np\n",
    "\n",
    "raw_datasets = load_dataset(\"glue\", \"mrpc\")\n",
    "checkpoint = \"bert-base-uncased\"\n",
    "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
    "\n",
    "def tokenize_dataset(dataset):\n",
    "    encoded = tokenizer(\n",
    "        dataset[\"sentence1\"],\n",
    "        dataset[\"sentence2\"],\n",
    "        padding=True,\n",
    "        truncation=True,\n",
    "        return_tensors='np',\n",
    "    )\n",
    "    return encoded.data\n",
    "\n",
    "tokenized_datasets = {\n",
    "    split: tokenize_dataset(raw_datasets[split]) for split in raw_datasets.keys()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import TFAutoModelForSequenceClassification\n",
    "\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.losses import SparseCategoricalCrossentropy\n",
    "\n",
    "model.compile(\n",
    "    optimizer='adam',\n",
    "    loss=SparseCategoricalCrossentropy(from_logits=True),\n",
    "    metrics=['accuracy'],\n",
    ")\n",
    "model.fit(\n",
    "    tokenized_datasets['train'],\n",
    "    np.array(raw_datasets['train']['label']), \n",
    "    validation_data=(\n",
    "        tokenized_datasets['validation'],\n",
    "        np.array(raw_datasets['validation']['label']),\n",
    "    ),\n",
    "    batch_size=8\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tensorflow.keras.optimizers.schedules import PolynomialDecay\n",
    "batch_size = 8\n",
    "num_epochs = 3\n",
    "# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied\n",
    "# by the total number of epochs\n",
    "num_train_steps = (len(tokenized_datasets['train']['input_ids']) // batch_size) * num_epochs\n",
    "lr_scheduler = PolynomialDecay(\n",
    "    initial_learning_rate=5e-5,\n",
    "    end_learning_rate=0.,\n",
    "    decay_steps=num_train_steps\n",
    "    )\n",
    "from tensorflow.keras.optimizers import Adam\n",
    "opt = Adam(learning_rate=lr_scheduler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n",
    "model.compile(optimizer=opt, loss=loss)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "preds = model.predict(tokenized_datasets['validation'])['logits']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(408, 2) (408,)"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "class_preds = np.argmax(preds, axis=1)\n",
    "print(preds.shape, class_preds.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'accuracy': 0.8578431372549019, 'f1': 0.8996539792387542}"
      ]
     },
     "execution_count": null,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from datasets import load_metric\n",
    "\n",
    "metric = load_metric(\"glue\", \"mrpc\")\n",
    "metric.compute(predictions=class_preds, references=raw_datasets['validation']['label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class F1_metric(tf.keras.metrics.Metric):\n",
    "    def __init__(self, name='f1_score', **kwargs):\n",
    "        super().__init__(name=name, **kwargs)\n",
    "        # Initialize our metric by initializing the two metrics it's based on:\n",
    "        # Precision and Recall\n",
    "        self.precision = tf.keras.metrics.Precision()\n",
    "        self.recall = tf.keras.metrics.Recall()\n",
    "\n",
    "    def update_state(self, y_true, y_pred, sample_weight=None):\n",
    "        # Update our metric by updating the two metrics it's based on\n",
    "        class_preds = tf.math.argmax(y_pred, axis=1)\n",
    "        self.precision.update_state(y_true, class_preds, sample_weight)\n",
    "        self.recall.update_state(y_true, class_preds, sample_weight)\n",
    "\n",
    "    def reset_state(self):\n",
    "        self.precision.reset_state()\n",
    "        self.recall.reset_state()\n",
    "\n",
    "    def result(self):\n",
    "        # To get the F1 result, we compute the harmonic mean of the current\n",
    "        # precision and recall\n",
    "        return 2 / ((1 / self.precision.result()) + (1 / self.recall.result())) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)\n",
    "lr_scheduler = PolynomialDecay(\n",
    "    initial_learning_rate=5e-5,\n",
    "    end_learning_rate=0.,\n",
    "    decay_steps=num_train_steps\n",
    "    )\n",
    "opt = Adam(learning_rate=lr_scheduler)\n",
    "model.compile(optimizer=opt, loss=loss, metrics=['accuracy', F1_metric()])\n",
    "model.fit(\n",
    "    tokenized_datasets['train'],\n",
    "    np.array(raw_datasets['train']['label']),\n",
    "    validation_data=(tokenized_datasets['validation'], np.array(raw_datasets['validation']['label'])),\n",
    "    batch_size=8,\n",
    "    epochs=3\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "name": "Fine-tuning a model with Keras",
   "provenance": []
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
